* CGK_Imputed.do: Imputes missing values.
*
* Coibion-Gorodnichenko-Kueng; Initial version: March 2012; this version: May 2017


* ---- session setup ----
* folder holding the CEX build files; every dataset read or written below lives under $data
global data "C:\Users\nfumi\Desktop\HANKempirical\InnocentBystandersreplication_folder\source_files\build files for CEX"
* $home is an alias of $data
global home "$data"
* command used to write datasets (invoked below as $savetype "file", replace)
global savetype "saveold"

* do not pause output, and fix the seed of the random-number generator
set more off
set seed 123456789


			*********************************
			* Prepare Imputation Regression *
			*********************************

* Loads the income file, coarsens the categorical regressors, turns flagged
* MEMB aggregates into missing values, builds the dummy variables used by the
* imputation regression and saves the result as CGK_Imputednew.dta.
use "$data/CGK_Incomenew.dta" ,clear

destring REF_RACE, replace

* re-classify imputation regressors to have enough observations per cell
* NOTE: Stata treats missing values as larger than any number, so every
*       "greater than" recode below is guarded with !missing(); otherwise
*       missing demographics would silently end up in the top category.
gen AGE_REFsq = AGE_REF^2
tab REF_RACE
qui:replace REF_RACE=7 if REF_RACE>2 & !missing(REF_RACE)
label var REF_RACE "race; 1: white, 2:black, 7:other race"
tab REF_RACE
tab EDUC_REF
qui:replace EDUC_REF=8 if EDUC_REF==1 | EDUC_REF==7
label var EDUC_REF "education: 8 means first through eith grade (1) or no school (7)"
tab EDUC_REF
* weeks worked are grouped into brackets; 0 and 52 are left as they are
tab INCWEEK1
qui:replace INCWEEK1=20 if INCWEEK1>0  & INCWEEK1<=20
qui:replace INCWEEK1=40 if INCWEEK1>20 & INCWEEK1<=40
qui:replace INCWEEK1=51 if INCWEEK1>40 & INCWEEK1<=51
label var INCWEEK1 "# weeks ref person worked: 0 [1 20] [21 40] [41 51] 52"
tab INCWEEK1
tab FAM_SIZE
qui:replace FAM_SIZE =6 if FAM_SIZE >5 & !missing(FAM_SIZE)
label var FAM_SIZE  "family size: 6 means more than 5"
tab FAM_SIZE
tab PERSLT18
qui:replace PERSLT18 =3 if PERSLT18 >2 & !missing(PERSLT18)
label var PERSLT18  "# persons under 18: 3 means more than 2"
tab PERSLT18
tab PERSOT64
qui:replace PERSOT64 =2 if PERSOT64 >1 & !missing(PERSOT64)
label var PERSOT64  "# persons over 64: 2 means more than 1"
tab PERSOT64
tab NO_EARNR
qui:replace NO_EARNR =2 if NO_EARNR >1 & !missing(NO_EARNR)
label var NO_EARNR  "# of earners: 2 means more than 1"
tab NO_EARNR

* replace zeros with missing values for aggregated MEMB variables
* (observations flagged by the *impI indicators are the ones to be imputed)
replace FSALARYXrb=. if FSALARYXimpI==1
replace FNONFRMXrb=. if FNONFRMXimpI==1
replace FFRMINCXrb=. if FFRMINCXimpI==1
replace FRRETIRXrb=. if FRRETIRXimpI==1
replace FSSIXr    =. if FSSIXimpI   ==1
replace FAMTFEDXr =. if FAMTFEDXimpI==1
replace FSLTAXXr  =. if FSLTAXXimpI ==1
drop *impI

* create categorical/factor variables to speed up imputation regression below
* (xi generates the _I* dummies picked up by the regression)
xi i.SEX_REF i.REF_RACE i.EDUC_REF i.INCWEEK1 i.FAM_SIZE i.PERSLT18 i.PERSOT64 i.NO_EARNR

compress
sort intdate NEWIDunique
label data "raw (index 1 and 3) and imputed data (index 2) for Coibion-Gorodnichenko-Kueng"
$savetype "$data/CGK_Imputednew.dta",replace
*/









			**************
			* Imputation *
			**************

* For each income/tax variable and each interview year 2011-2013:
*   1. classify observations (1: valid non-zero, 2: missing -> impute, 3: zero),
*   2. regress the (log) variable on demographics within the valid non-zero sample,
*   3. impute missing values as linear prediction + a residual drawn at random
*      (with replacement) from the estimated residuals of the same year,
*   4. trim imputed values to the range of the observed values.
use "$data/CGK_Imputednew.dta",clear


* ======= start LHS-loop ===========
foreach LHS in ///
	 FSALARYXrb FNONFRMXrb FFRMINCXrb FRRETIRXrb FSSIXr FAMTFEDXr FSLTAXXr /// MEMB income and tax variables
	 UNEMPLXrb COMPENSXrb WELFAREXrb INTEARNX FININCXrb PENSIONXrb INCLOSSArb INCLOSSBrb OTHRINCXrb FOODSMPXrb INCCONTXrb FEDTAXX SLOCTAXX TAXPROPX FEDRFNDX SLRFUNDX MISCTAXX OTHRFNDX { // FMLY income and tax variables

	#delim;
	/* working variables are rebuilt from scratch for every LHS variable */
	cap:drop yhat;     gen yhat=.;
	cap:drop rres;     gen rres=.;
	cap:drop ssort;    gen ssort=.;
	cap:drop sseq;     gen sseq=.;
	cap:drop sseq_res; gen sseq_res=.;
	cap:drop LHSlevel; gen LHSlevel =`LHS'; /*save original variable before log transformation*/

	forvalues y=2011(1)2013 {; 

		  /* deal with Food Stamps which are missing in 1983 and 1984 */
		  /* (never triggered for the 2011-2013 range used here) */
		  if "`LHS'"=="FOODSMPXrb" & (`y'==1983|`y'==1984) {;
			local y=1982;
		  };

		  di _n(1)"`y'"_n(1);
			
		  /* index cases - 1: non-zero non-missing values, 2: missing values to be imputed (known to be non-zero); 3: zeros */
		  replace ssort = 1*(`LHS'!=0 & `LHS'!=.) + 2*(`LHS'==.) + 3*(`LHS'==0) if QINTRVYR==`y';
		  tab ssort if QINTRVYR==`y';

		  /* log transformation for variables which are non-negative */
		  if "`LHS'"=="FNONFRMXrb"|"`LHS'"=="FFRMINCXrb"|"`LHS'"=="INCLOSSArb"|"`LHS'"=="INCLOSSBrb" {;
				  /* do nothing for variables that can be negative */
		  };
		  else {; /* log transformation for non-negative variables */
			gen temp =ln(`LHS')  if ssort==1 & QINTRVYR==`y';
			replace `LHS' = temp if ssort==1 & QINTRVYR==`y'; drop temp;
		  };
		  
		  /* estimate imputation model */
		  areg `LHS' AGE_REF AGE_REFsq _I* [fw=fwt] if ssort==1 & QINTRVYR==`y', absorb(intdate) cluster(NEWIDunique);
		  predict temp if ssort==1 & QINTRVYR==`y',res; replace rres=temp if ssort==1 & QINTRVYR==`y'; drop temp; /* residuals */
		  predict temp if ssort==2 & QINTRVYR==`y',xb ; replace yhat=temp if ssort==2 & QINTRVYR==`y'; drop temp; /* linear prediction */

		  /* draw randomly with replacement from residuals */
		  /* only observations with a non-missing residual are indexed, so every random draw points to an existing residual */
		  count if ssort==1 & QINTRVYR==`y' & rres!=.; local n=r(N);
		  egen temp = seq()               if ssort==1 & QINTRVYR==`y' & rres!=.; replace sseq_res=temp if ssort==1 & QINTRVYR==`y'; drop temp; /* residual index {1,...,n} */
		  gen  temp = ceil(`n'*uniform()) if ssort==2 & QINTRVYR==`y'; replace sseq    =temp if ssort==2 & QINTRVYR==`y'; drop temp; /* random replacement index with elements in {1,...,n} */
	};
	#delim cr

	* merge residuals to missing values using the random replacement index
	preserve
	keep QINTRVYR sseq_res rres
	drop if rres==.   // keep least-squares residuals
	ren sseq_res sseq // map residual index to random replacement index
	sort QINTRVYR sseq
	$savetype "$data/tempfiles/residualsnew.dta",replace
	restore
	sort QINTRVYR sseq
	* many-to-one merge: the residual file holds exactly one residual per (year, index),
	* while the random replacement index in the master data can have duplicate values;
	* the update option fills the (missing) rres of the observations to be imputed
	merge m:1 QINTRVYR sseq using "$data/tempfiles/residualsnew.dta", update
	drop if _merge==2 // drop non-matched residuals
	drop _merge

	* impute values
	* first restore the original level of every originally observed value
	* (this also recovers values for which the log transformation returned missing)
	replace `LHS' = LHSlevel       if LHSlevel!=.
	  if "`LHS'"=="FNONFRMXrb"|"`LHS'"=="FFRMINCXrb"|"`LHS'"=="INCLOSSArb"|"`LHS'"=="INCLOSSBrb" {
	replace `LHS' = yhat+rres      if `LHS'==.
	  }
	  else {
	replace `LHS' = exp(yhat+rres) if `LHS'==.
	  }

	* trim top-coded values (otherwise the standard deviation is inflated by the (out-of-sample) imputation)
	* imputed values are capped at the min/max of the observed non-zero values of the same year
	forvalues y=2011(1)2013 {
	  sum     `LHS'          if ssort==1 & QINTRVYR==`y' 
	  replace `LHS' = r(max) if ssort==2 & QINTRVYR==`y' & `LHS'>r(max)
	  replace `LHS' = r(min) if ssort==2 & QINTRVYR==`y' & `LHS'<r(min)
	}

	* save imputation indicator
	gen `LHS'ii = ssort
	label var `LHS'ii "imputation indicator of `LHS'. 1: non-zero value, 2: imputed value, 3: zero"
	label var `LHS'   "`LHS': original and imputed values"
}
*${savetype},replace 
* remove the temporary residual file (cap n: report but do not stop on failure)
cap n rm "$data/tempfiles/residualsnew.dta"
* ======= end LHS-loop ===========

* cleaning up
* drop the working variables of the imputation loop and the xi dummies
drop yhat rres ssort sseq* LHSlevel _I*


* snapshot of the imputed item-level data (written to the current working directory)
save temp_, replace


* construct family income variables
* Each aggregate comes in two versions:
*   <name>    : set only if none of its components was imputed (indicator ii != 2)
*   <name>IMP : uses raw and imputed component values alike
gen salary      = FSALARYXrb if FSALARYXrbii!=2
gen salaryIMP   = FSALARYXrb
gen business    = FNONFRMXrb + FFRMINCXrb if (FNONFRMXrbii!=2 & FFRMINCXrbii!=2)
gen businessIMP = FNONFRMXrb + FFRMINCXrb
gen finance     = FININCXrb + INTEARNX + PENSIONXrb + INCLOSSArb + INCLOSSBrb if (FININCXrbii!=2 & INTEARNXii!=2 & PENSIONXrbii!=2 & INCLOSSArbii!=2 & INCLOSSBrbii!=2)
gen financeIMP  = FININCXrb + INTEARNX + PENSIONXrb + INCLOSSArb + INCLOSSBrb
gen otherinc    = FRRETIRXrb + FSSIXr + UNEMPLXrb + COMPENSXrb + WELFAREXrb + INCCONTXrb + OTHRINCXrb if (FRRETIRXrbii!=2 & FSSIXrii!=2 & UNEMPLXrbii!=2 & COMPENSXrbii!=2 & WELFAREXrbii!=2 & INCCONTXrbii!=2 & OTHRINCXrbii!=2)
gen otherincIMP = FRRETIRXrb + FSSIXr + UNEMPLXrb + COMPENSXrb + WELFAREXrb + INCCONTXrb + OTHRINCXrb 
gen foodstamp   = FOODSMPXrb if FOODSMPXrbii!=2
gen foodstampIMP= FOODSMPXrb
* the refund components must be checked through their imputation indicators (..ii),
* not through the refund amounts themselves
gen fedtax      = FAMTFEDXr + FEDTAXX - FEDRFNDX if (FAMTFEDXrii!=2 & FEDTAXXii!=2  & FEDRFNDXii!=2)
gen fedtaxIMP   = FAMTFEDXr + FEDTAXX - FEDRFNDX
gen sltax       = FSLTAXXr + SLOCTAXX - SLRFUNDX if (FSLTAXXrii!=2  & SLOCTAXXii!=2 & SLRFUNDXii!=2)
gen sltaxIMP    = FSLTAXXr + SLOCTAXX - SLRFUNDX
gen othertax    = TAXPROPX + MISCTAXX - OTHRFNDX if (TAXPROPXii!=2  & MISCTAXXii!=2 & OTHRFNDXii!=2)
gen othertaxIMP = TAXPROPX + MISCTAXX - OTHRFNDX
* totals: before-tax income, total taxes, after-tax income (food stamps are not included)
gen incomebt    = salary + business + finance + otherinc
gen incomebtIMP = salaryIMP + businessIMP + financeIMP + otherincIMP
gen totaltax    = fedtax + sltax + othertax
gen totaltaxIMP = fedtaxIMP + sltaxIMP + othertaxIMP
gen incomeat    = incomebt - totaltax
gen incomeatIMP = incomebtIMP - totaltaxIMP

* order data
* step 1: put all variables in alphabetical order
aorder
* step 2: move identifiers/weights, the constructed aggregates and the item-level
*         income and tax variables (in that sequence) to the front
local idvars   NEWIDunique intno intdate QINTRV* fwt FINLWT21 RESPSTAT
local aggvars  incomebt incomebtIMP incomeat incomeatIMP totaltax totaltaxIMP salary salaryIMP business businessIMP finance financeIMP otherinc otherincIMP foodstamp foodstampIMP fedtax fedtaxIMP sltax sltaxIMP othertax othertaxIMP
* MEMB income and tax variables
local membvars FSALARYXrb FNONFRMXrb FFRMINCXrb FRRETIRXrb FSSIXr FAMTFEDXr FSLTAXXr
* FMLY income and tax variables
local fmlyvars UNEMPLXrb COMPENSXrb WELFAREXrb INTEARNX FININCXrb PENSIONXrb INCLOSSArb INCLOSSBrb OTHRINCXrb FOODSMPXrb INCCONTXrb FEDTAXX SLOCTAXX TAXPROPX FEDRFNDX SLRFUNDX MISCTAXX OTHRFNDX
order `idvars' `aggvars' `membvars' `fmlyvars'


* label and save data

* --- identifiers, dates and weights ---
label variable NEWIDunique "unique household identifier"
label variable intno "interview number"
label variable intdate "interview date (monthly)"
label variable QINTRVMO "interview month"
label variable QINTRVYR "interview year"
label variable fwt "sample weights for STATA (integer of FINLWT21)"
label variable FINLWT21 "sample weights"
label variable RESPSTAT "full income response indicator: 1 yes, 2 no"

* --- constructed income and tax aggregates (non-imputed versions) ---
label variable incomebt "before-tax income (except food stamps)"
label variable incomeat "after-tax income (except food stamps)"
label variable totaltax "total tax liabilites"
label variable salary "salary income"
label variable business "farm and business income"
label variable finance "financial income: interest, dividends, pensions and annuities, etc."
label variable otherinc "all other income except foodstamps"
label variable foodstamp "foodstamps"
label variable fedtax "net federal tax liabilities"
label variable sltax "net state and local tax liabilities"
label variable othertax "other net tax liabilities"

* --- the ...IMP counterparts of the aggregates share one label pattern ---
local aggregates incomebt incomeat totaltax salary business finance otherinc foodstamp fedtax sltax othertax
foreach var of local aggregates {
	label variable `var'IMP "raw and imputed values of `var'"
}

* --- MEMB income and tax variables ---
label variable FSALARYXrb "family wage and salary income before deductions (raw and bracketed)"
label variable FNONFRMXrb "family income/loss from nonfarm business, partership, or prof. practice (raw and bracketed)"
label variable FFRMINCXrb "family owned farm income/loss (raw and bracketed)"
label variable FRRETIRXrb "family income from SS and Railroad Retirement before deductions (raw and bracketed)"
label variable FSSIXr "family Supplemental Security income (raw only)"
label variable FAMTFEDXr "family federal tax withholding (raw only)"
label variable FSLTAXXr "family state and local tax withholding (raw only)"

* --- FMLY income and tax variables ---
label variable UNEMPLXrb "unemployment compensation (raw and bracketed)"
label variable COMPENSXrb "worker's compensation and veteran's benefits (raw and bracketed)"
label variable WELFAREXrb "public assistance or welfare (raw and bracketed)"
label variable INTEARNX "interest on savings accounts or bonds (raw only)"
label variable FININCXrb "income from dividends, royalties, estates, or trusts (raw and bracketed)"
label variable PENSIONXrb "pensions or annuities - private, military, gov., IRA, Keogh (raw and bracketed)"
label variable INCLOSSArb "income/loss from roomers or boarders (raw and bracketed)"
label variable INCLOSSBrb "income/loss from payments from other rental units (raw and bracketed)"
label variable OTHRINCXrb "other money income, e.g. scholarships, fellowships, stipends, etc. (raw and bracketed)"
label variable FOODSMPXrb "values of all food staps (raw and bracketed)"
label variable INCCONTXrb "contributions from alimony, child support and other sources (raw and bracketed)"
label variable FEDTAXX "federal taxes payed in addition to withholdng (raw only)"
label variable SLOCTAXX "state and local taxes payed in addition to withholding (raw only)"
label variable TAXPROPX "personal property taxes for vehicles (raw only)"
label variable FEDRFNDX "federal income tax refund (raw only)"
label variable SLRFUNDX "state and local tax refund (raw only)"
label variable MISCTAXX "personal property taxes and other taxes not reported elsewhere (raw only)"
label variable OTHRFNDX "other refunds received (raw only)"

* --- imputation indicators (<variable>ii) of all item-level variables ---
local membitems FSALARYXrb FNONFRMXrb FFRMINCXrb FRRETIRXrb FSSIXr FAMTFEDXr FSLTAXXr
local fmlyitems UNEMPLXrb COMPENSXrb WELFAREXrb INTEARNX FININCXrb PENSIONXrb INCLOSSArb INCLOSSBrb OTHRINCXrb FOODSMPXrb INCCONTXrb FEDTAXX SLOCTAXX TAXPROPX FEDRFNDX SLRFUNDX MISCTAXX OTHRFNDX
foreach var in `membitems' `fmlyitems' {
	label variable `var'ii "imputation indicator for `var': (1) valid non-zero, (2) imputed, (3) valid zero"
}

* --- demographic regressors ---
label variable AGE_REF "age of reference person"
label variable AGE_REFsq "age squared of reference person"
label variable EDUC_REF "education level of reference person"
label variable FAM_SIZE "family size"
label variable INCWEEK1 "# of weeks reference person worked"
label variable NO_EARNR "# of earners"
label variable PERSLT18 "# of persons less than 18"
label variable PERSOT64 "# of persons over 64"
label variable REF_RACE "race of reference person"
label variable SEX_REF "gender of reference person"


* save data
* attach the dataset label, sort by interview date and household, shrink storage
* types, then write the file with the command stored in $savetype
label data "raw (index 1 and 3) and imputed data (index 2) for Coibion-Gorodnichenko-Kueng"
sort intdate NEWIDunique
compress
$savetype "$data/CGK_Imputednew.dta",replace
*/
